packages = c('tidyverse', 'kableExtra', 'plotly', 'lubridate', 'pacman')
for (p in packages){
if (!require(p,character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
p_load(tidyverse,lubridate,showtext)
showtext_auto()
font_add_google("Bebas Neue", "Bebas Neue")
df <- read_csv('dataset/netflix_titles.csv')
Rows: 8807 Columns: 12── Column specification ───────────────────────────────────────────────────────
Delimiter: ","
chr (11): show_id, type, title, director, cast, country, date_added, rating...
dbl (1): release_year
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
3.1 Өгөгдөл багсгалт
# Хэрэгцээгүй утгуудыг цэвэрлэх явц
df <- tibble::as_tibble(df) %>%
select(-c( description))
# дата шалтгалт
kable(df[1:10,]) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%", height = "500px")
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in |
|---|---|---|---|---|---|---|---|---|---|---|
| s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NA | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries |
| s2 | TV Show | Blood & Water | NA | Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries |
| s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera | NA | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Action & Adventure |
| s4 | TV Show | Jailbirds New Orleans | NA | NA | NA | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV |
| s5 | TV Show | Kota Factory | NA | Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV Comedies |
| s6 | TV Show | Midnight Mass | Mike Flanagan | Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver | NA | September 24, 2021 | 2021 | TV-MA | 1 Season | TV Dramas, TV Horror, TV Mysteries |
| s7 | Movie | My Little Pony: A New Generation | Robert Cullen, José Luis Ucha | Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr | NA | September 24, 2021 | 2021 | PG | 91 min | Children & Family Movies |
| s8 | Movie | Sankofa | Haile Gerima | Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri | United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia | September 24, 2021 | 1993 | TV-MA | 125 min | Dramas, Independent Movies, International Movies |
| s9 | TV Show | The Great British Baking Show | Andy Devonshire | Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood | United Kingdom | September 24, 2021 | 2021 | TV-14 | 9 Seasons | British TV Shows, Reality TV |
| s10 | Movie | The Starling | Theodore Melfi | Melissa McCarthy, Chris O'Dowd, Kevin Kline, Timothy Olyphant, Daveed Diggs, Skyler Gisondo, Laura Harrier, Rosalind Chao, Kimberly Quinn, Loretta Devine, Ravi Kapoor | United States | September 24, 2021 | 2021 | PG-13 | 104 min | Comedies, Dramas |
3.2 Хоосон утгуудыг цэвэрлэх, цэгцлэх
# Хоосон утгатай өгөгдлийг илрүүлэх
data.frame(var = c(colnames(df)),
missing = sapply(df, function(x) sum(is.na(x))), row.names = NULL) %>%
mutate(missing = cell_spec(missing, "html",
color = ifelse(missing > 0, 'red', 'black'))) %>%
rename(`Баганын нэр` = var, `Хоосон утгын тоо` = missing) %>%
kable(format = "html", escape = F, align = c("l", "c")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| Баганын нэр | Хоосон утгын тоо |
|---|---|
| show_id | 0 |
| type | 0 |
| title | 0 |
| director | 2634 |
| cast | 825 |
| country | 831 |
| date_added | 10 |
| release_year | 0 |
| rating | 4 |
| duration | 3 |
| listed_in | 0 |
# Мод олох функч
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
df$rating[is.na(df$rating)] <- getmode(df$rating)
# давхардсан өгөгдөлүүдийг цэвэхлэх -> title, country, type болон release_year
df <- distinct(df, type, title, country, release_year, show_id, listed_in, .keep_all = T)
3.3 Он сарын порматын өөрчлөлт
# он сарын утгын порматын өөрчлөлт
df$date_added <- as.Date(df$date_added, format = "%B %d, %Y")
# он сарын порматын шалгалт
kable(df[1:10,]) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(6, color = 'red', bold = T) %>%
scroll_box(width = "100%", height = "352px")
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in |
|---|---|---|---|---|---|---|---|---|---|---|
| s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NA | United States | 2021-09-25 | 2020 | PG-13 | 90 min | Documentaries |
| s2 | TV Show | Blood & Water | NA | Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng | South Africa | 2021-09-24 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries |
| s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera | NA | 2021-09-24 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Action & Adventure |
| s4 | TV Show | Jailbirds New Orleans | NA | NA | NA | 2021-09-24 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV |
| s5 | TV Show | Kota Factory | NA | Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar | India | 2021-09-24 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV Comedies |
| s6 | TV Show | Midnight Mass | Mike Flanagan | Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver | NA | 2021-09-24 | 2021 | TV-MA | 1 Season | TV Dramas, TV Horror, TV Mysteries |
| s7 | Movie | My Little Pony: A New Generation | Robert Cullen, José Luis Ucha | Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr | NA | 2021-09-24 | 2021 | PG | 91 min | Children & Family Movies |
| s8 | Movie | Sankofa | Haile Gerima | Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri | United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia | 2021-09-24 | 1993 | TV-MA | 125 min | Dramas, Independent Movies, International Movies |
| s9 | TV Show | The Great British Baking Show | Andy Devonshire | Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood | United Kingdom | 2021-09-24 | 2021 | TV-14 | 9 Seasons | British TV Shows, Reality TV |
| s10 | Movie | The Starling | Theodore Melfi | Melissa McCarthy, Chris O'Dowd, Kevin Kline, Timothy Olyphant, Daveed Diggs, Skyler Gisondo, Laura Harrier, Rosalind Chao, Kimberly Quinn, Loretta Devine, Ravi Kapoor | United States | 2021-09-24 | 2021 | PG-13 | 104 min | Comedies, Dramas |
4.1 Агуулгын төрлөөр эзлэх хувь
content_by_type <- df %>% group_by(type) %>%
summarise(count = n())
plot_ly(content_by_type, labels = ~type, values = ~count,
type = 'pie', marker = list(colors = c("#bd3939", "#399ba3"))) %>%
layout(xaxis = list(showgrid = F, zeroline = F, showticklabels = F),
yaxis = list(showgrid = F, zeroline = F, showticklabels = F),
title = "Агуулгын төрлөөр эзлэх хувь", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.2 Үйлдвэрлэсэн агуулгын хэмжээгээр эхний 12 улс
s <- strsplit(df$country, split = ", ")
cntry_split <- data.frame(type = rep(df$type, sapply(s, length)), country = unlist(s))
cntry_split$country <- as.character(gsub(",","", cntry_split$country))
amount_by_country <- na.omit(cntry_split) %>%
group_by(country, type) %>%
summarise(count = n())
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
w <- reshape(data = data.frame(amount_by_country),
idvar = "country",
v.names = "count",
timevar = "type",
direction = "wide") %>%
arrange(desc(count.Movie)) %>% top_n(12)
Selecting by count.TV Show
names(w)[2] <- "count_movie"
names(w)[3] <- "count_tv_show"
w <- w[order(desc(w$count_movie + w$count_tv_show)),]
plot_ly(w, x = w$country, y = ~count_movie,
type = 'bar', name = 'Movie', marker = list(color = '#bd3939')) %>%
add_trace(y = ~count_tv_show, name = 'TV Show', marker = list(color = '#399ba3')) %>%
layout(xaxis = list(categoryorder = "array",
categoryarray = w$country,
title = "Улс"),
yaxis = list(title = 'Өгөгдлийн тоо'),
barmode = 'stack',
title = "Үйлдвэрлэсэн агуулгын хэмжээгээр эхний 12 улс", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.3 Жилийн болгоны агуулгын өсөлт
df_by_date_full <- df %>% group_by(date_added) %>%
summarise(added_today = n()) %>%
mutate(total_number_of_content = cumsum(added_today), type = "Total")
df_by_date <- df %>% group_by(date_added, type) %>%
summarise(added_today = n()) %>%
ungroup() %>%
group_by(type) %>%
mutate(total_number_of_content = cumsum(added_today))
`summarise()` has grouped output by 'date_added'. You can override using the `.groups` argument.
full_data <- rbind(as.data.frame(df_by_date_full), as.data.frame(df_by_date))
plot_ly(full_data, x = ~date_added, y = ~total_number_of_content,
mode = 'lines', type = 'scatter',
color = ~type, colors = c("#bd3939", "#9addbd", "#399ba3")) %>%
layout(yaxis = list(title = 'Тоо'),
xaxis = list(title = 'он'),
title = "Жилийн болгоны агуулгын өсөлт", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.4 Сард нэмэгдсэн агуулгын хэмжээ
df_by_date_month <- df %>% group_by(month_added = floor_date(date_added, "month"), type) %>%
summarise(added_today = n())
`summarise()` has grouped output by 'month_added'. You can override using the `.groups` argument.
wd <- reshape(data = data.frame(df_by_date_month),
idvar = "month_added",
v.names = "added_today",
timevar = "type",
direction = "wide")
names(wd)[2] <- "added_today_movie"
names(wd)[3] <- "added_today_tv_show"
wd$added_today_movie[is.na(wd$added_today_movie)] <- 0
wd$added_today_tv_show[is.na(wd$added_today_tv_show)] <- 0
wd <-na.omit(wd)
plot_ly(wd, x = wd$month_added, y = ~added_today_movie,
type = 'bar', name = 'Movie',
marker = list(color = '#bd3939')) %>%
add_trace(y = ~added_today_tv_show,
name = 'TV Show',
marker = list(color = '#399ba3')) %>%
layout(xaxis = list(categoryorder = 'array',
categoryarray = wd$month_added,
title = 'Он'),
yaxis = list(title = 'Тоо'),
barmode = 'stack',
title = "Сард нэмэгдсэн агуулгын хэмжээ", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.5 Үнэлгээгээр агуулгын хуваарилалт
df_by_rating <- df %>% group_by(rating) %>%
summarise(count = n())
plot_ly(df_by_rating, type = 'pie',
labels = ~rating, values = ~count) %>%
layout(xaxis = list(showgrid = F, zeroline = F, showticklabels = F),
yaxis = list(showgrid = F, zeroline = F, showticklabels = F),
title = "Үнэлгээгээр агуулгын хуваарилалт", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.6 Шилдэг төрлүүд (Кино болон ТВ шоу)
s_genres <- strsplit(df$listed_in, split = ", ")
genres_listed_in <- data.frame(type = rep(df$type, sapply(s_genres, length)),
listed_in = unlist(s_genres))
genres_listed_in$listed_in <- as.character(gsub(",","",genres_listed_in$listed_in))
df_by_listed_in <- genres_listed_in %>%
group_by(type, listed_in) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>% top_n(10)
`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.Selecting by count
plot_ly(df_by_listed_in, x = ~listed_in, y = ~count,
type = 'bar', color = ~type,
colors = c("#bd3939", "#399ba3")) %>%
layout(xaxis = list(categoryorder = "array",
categoryarray = df_by_listed_in$listed_in,
title = 'Төрөл',
tickangle = 45),
yaxis = list(title = 'Тоо хэмжээ'),
title = "Шилдэг төрлүүд (Кино болон ТВ шоу)", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
4.7 Шилдэг 12 орны киноны үргэлжлэх хугацаа
mov_duration_cntry <- na.omit(df[df$type == "Movie",][,c("country", "duration")])
s_dur <- strsplit(mov_duration_cntry$country, split = ", ")
duration_full <- data.frame(duration = rep(mov_duration_cntry$duration,
sapply(s_dur, length)),
country = unlist(s_dur))
duration_full$duration <- as.numeric(gsub(" min","", duration_full$duration))
duration_full_subset <- duration_full[duration_full$country %in%
c("United States", "India", "United Kingdom",
"Canada", "France", "Japan", "Spain", "South Korea",
"Mexico", "Australia", "China", "Taiwan"),]
plot_ly(duration_full_subset, y = ~duration, color = ~country, type = "box") %>%
layout(xaxis = list(title = "Улс"),
yaxis = list(title = 'Хугацаа (минутаар)'),
title = "Шилдэг 12 орны киноны үргэлжлэх хугацаа", margin = list(t = 54),
legend = list(x = 100, y = 0.5))
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning: n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
4.8 Ангилалуудыг харуулах: Корреляци ба хамгийн түгээмэл
df_show_categories <- df %>%
select(c('show_id','type','listed_in')) %>%
separate_rows(listed_in, sep = ',') %>%
rename(Show_Category = listed_in)
df_show_categories$Show_Category <- trimws(df_show_categories$Show_Category)
head(df_show_categories)
NA
df_show_categories %>% mutate(Show_Category = fct_infreq(Show_Category)) %>%
ggplot(aes(x = Show_Category)) +
geom_bar() + scale_x_discrete() + facet_wrap(~type, scales = 'free_x') + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + coord_cartesian(xlim = c(1,20))
df_unique_categories <- df_show_categories %>% group_by(type,Show_Category) %>% summarise()
`summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
df_category_correlations_movies <- data.frame(expand_grid(type = 'Movie',
Category1 = subset(df_unique_categories, type == 'Movie')$Show_Category,
Category2 = subset(df_unique_categories, type == 'Movie')$Show_Category))
df_category_correlations_TV <- data.frame(expand_grid(type = 'TV Show',
Category1 = subset(df_unique_categories, type == 'TV Show')$Show_Category,
Category2 = subset(df_unique_categories, type == 'TV Show')$Show_Category))
df_category_correlations <- rbind(df_category_correlations_movies,df_category_correlations_TV)
df_category_correlations$matched_count <- apply(df_category_correlations, MARGIN = 1,FUN = function(x) {
length(intersect(subset(df_show_categories, type == x['type'] & Show_Category == x['Category1'])$show_id,
subset(df_show_categories, type == x['type'] & Show_Category == x['Category2'])$show_id))})
df_category_correlations <- subset(df_category_correlations, (as.character(Category1) < as.character(Category2)) & (matched_count > 0))
# Change plot size to 8 x 3
options(repr.plot.width=14, repr.plot.height=10)
ggplot(subset(df_category_correlations, type == 'Movie'), aes(x = Category1, y = Category2, fill = matched_count)) +
geom_tile() + facet_wrap( ~type, scales = 'free') + scale_fill_distiller(palette = "Spectral") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.text = element_text(size = 14), legend.title = element_text(size = 16))
ggplot(subset(df_category_correlations, type == 'TV Show'), aes(x = Category1, y = Category2, fill = matched_count)) +
geom_tile() + facet_wrap( ~type, scales = 'free') + scale_fill_distiller(palette = "Spectral") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), , legend.text = element_text(size = 14), legend.title = element_text(size = 16))
4.9 Жүжигчидтэй холбоотой өгөгдөл
df %>% select(c('show_id','cast','director')) %>%
gather(key = 'role', value = 'person', cast, director) %>%
filter(person != "") %>% separate_rows(person, sep = ',') -> df_show_people
df_show_people$person <- trimws(df_show_people$person)
head(df_show_people)
df_people_freq<- df_show_people %>% group_by(person,role) %>%
summarise(count = n()) %>% arrange(desc(count))
`summarise()` has grouped output by 'person'. You can override using the `.groups` argument.
df_people_freq %>% group_by(role) %>% top_n(10,count) %>% ungroup() %>% ggplot(aes(x = fct_reorder(person,count,.desc = T), y = count, fill = role)) +
geom_bar(stat = 'identity') + scale_x_discrete() + facet_wrap(~role, scales = 'free_x') + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),legend.position = 'none') + labs(x = 'жүжигчин болон зохиолчдын нэрс')